In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model. NSRDB model won't be available to ground measurements.

In [2]:
nsrdb = cs_detection.ClearskyDetection.read_pickle('srrl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [3]:
len(nsrdb.df)
Out[3]:
315552

Train/test on NSRDB data to find optimal parameters

Default classifier

In [4]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [5]:
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [6]:
clf = ensemble.RandomForestClassifier(random_state=42)
In [7]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [8]:
feature_cols = [
    'tfn',
    'abs_ideal_ratio_diff grad',
    'abs_ideal_ratio_diff grad mean', 
    'abs_ideal_ratio_diff grad std',
    'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib ratio', 
    'GHI Clearsky GHI pvlib ratio mean',
    'GHI Clearsky GHI pvlib ratio std',
    'GHI Clearsky GHI pvlib diff',
    'GHI Clearsky GHI pvlib diff mean', 
    'GHI Clearsky GHI pvlib diff std'
]

target_cols = ['sky_status']
import seaborn as snspplot = sns.pairplot(train.df[feature_cols + target_cols], hue='sky_status')from sklearn.cluster import KMeans, DBSCAN from sklearn.preprocessing import StandardScaler X = train.df[feature_cols + ['GHI', 'Clearsky GHI pvlib']].values X_std = StandardScaler().fit_transform(X) kmeans = DBSCAN().fit(X_std) # kmeans = KMeans(n_clusters=4, random_state=0).fit(X_std) train.df['sky_status_km'] = 0 train.df['sky_status_km'] = kmeans.labels_ pd.unique(train.df['sky_status_km'])train.df['sky_status_km'] = (train.df['sky_status_km'] == 0) | (train.df['sky_status_km'] == 0) x = train.df.index trace1 = go.Scatter(x=x, y=train.df['GHI']) trace2 = go.Scatter(x=x, y=train.df['Clearsky GHI pvlib']) # trace3 = go.Scatter(x=train.df[train.df['sky_status']].index, # y=train.df[train.df['sky_status']]['GHI'], name='NSRBD', mode='markers') trace3 = go.Scatter(x=train.df[train.df['sky_status'] & ~train.df['sky_status_km']].index, y=train.df[train.df['sky_status'] & ~train.df['sky_status_km']]['GHI'], name='NSRBD', mode='markers') trace4 = go.Scatter(x=train.df[~train.df['sky_status'] & train.df['sky_status_km']].index, y=train.df[~train.df['sky_status'] & train.df['sky_status_km']]['GHI'], name='km', mode='markers') trace5 = go.Scatter(x=train.df[train.df['sky_status'] & train.df['sky_status_km']].index, y=train.df[train.df['sky_status'] & train.df['sky_status_km']]['GHI'], name='both', mode='markers') iplot([trace1, trace2, trace3, trace4, trace5])vis = visualize.Visualizer() vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
In [9]:
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [10]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)
In [11]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

In [12]:
metrics.accuracy_score(test.df['sky_status'], pred)
Out[12]:
0.94874429223744294
In [13]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.show()
In [14]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [15]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Gridsearch

In [16]:
import warnings
with warnings.catch_warnings(): warnings.simplefilter('ignore') params={} params['max_depth'] = [4, 8, 12, 16] params['n_estimators'] = [64] params['class_weight'] = [None, 'balanced'] params['min_samples_leaf'] = [1, 2, 3] results = [] for depth, nest, cw, min_samples in itertools.product(params['max_depth'], params['n_estimators'], params['class_weight'], params['min_samples_leaf']): print('Params:') print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples)) train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('01-01-1999', '01-01-2014') utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True) test2 = cs_detection.ClearskyDetection(train.df) test2.trim_dates('01-01-2014', '01-01-2015') clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw, min_samples_leaf=min_samples, n_jobs=-1, random_state=42) clf.fit(train2.df[train2.df['GHI'] > 0][feature_cols].values, train2.df[train2.df['GHI'] > 0][target_cols].values.flatten()) print('\t Scores:') test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True) accuracy_score = metrics.accuracy_score(test2.df['sky_status'], test_pred) print('\t\t accuracy: {}'.format(accuracy_score)) f1_score = metrics.f1_score(test2.df['sky_status'], test_pred) print('\t\t f1:{}'.format(f1_score)) recall_score = metrics.recall_score(test2.df['sky_status'], test_pred) print('\t\t recall:{}'.format(recall_score)) precision_score = metrics.precision_score(test2.df['sky_status'], test_pred) print('\t\t precision:{}'.format(precision_score)) results.append({'max_depth': depth, 'n_estimators': nest, 'class_weight': cw, 'min_samples_leaf': min_samples, 'accuracy': accuracy_score, 'f1': f1_score, 'recall': recall_score, 'precision': precision_score})runs_df = pd.DataFrame(results)runs_df.to_csv('8_srrl_directional_features.csv')
In [17]:
runs_df = pd.read_csv('8_srrl_directional_features.csv')
In [18]:
runs_df[['accuracy', 'f1', 'recall', 'precision']].iplot(kind='box')
In [19]:
runs_df
Out[19]:
Unnamed: 0 accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
0 0 0.529737 NaN 0.324617 4 1 64 0.207656 0.743243
1 1 0.529737 NaN 0.324617 4 2 64 0.207656 0.743243
2 2 0.529737 NaN 0.324617 4 3 64 0.207656 0.743243
3 3 0.464212 balanced 0.348623 4 1 64 0.213842 0.942943
4 4 0.464212 balanced 0.348623 4 2 64 0.213842 0.942943
5 5 0.464212 balanced 0.348623 4 3 64 0.213842 0.942943
6 6 0.944635 NaN 0.810842 8 1 64 0.843750 0.780405
7 7 0.927968 NaN 0.767673 8 2 64 0.753251 0.782658
8 8 0.926941 NaN 0.765568 8 3 64 0.747496 0.784535
9 9 0.890297 balanced 0.730435 8 1 64 0.583072 0.977477
10 10 0.891781 balanced 0.733258 8 2 64 0.586409 0.978228
11 11 0.899600 balanced 0.747814 8 3 64 0.604964 0.978979
12 12 0.952968 NaN 0.840124 12 1 64 0.869478 0.812688
13 13 0.952568 NaN 0.838295 12 2 64 0.870303 0.808559
14 14 0.952740 NaN 0.839098 12 3 64 0.869863 0.810435
15 15 0.950342 balanced 0.853239 12 1 64 0.774816 0.949324
16 16 0.951199 balanced 0.855696 12 2 64 0.777369 0.951577
17 17 0.947260 balanced 0.846051 12 3 64 0.760635 0.953078
18 18 0.955023 NaN 0.848810 16 1 64 0.868132 0.830330
19 19 0.953196 NaN 0.842731 16 2 64 0.861569 0.824700
20 20 0.954509 NaN 0.845990 16 3 64 0.871764 0.821697
21 21 0.954167 balanced 0.856119 16 1 64 0.818992 0.896772
22 22 0.954224 balanced 0.858254 16 2 64 0.810955 0.911411
23 23 0.953653 balanced 0.857042 16 3 64 0.807029 0.913664

Best recall model

In [20]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [21]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [22]:
best_recall = runs_df.iloc[runs_df['recall'].idxmax()]
In [23]:
params_recall = best_recall[['max_depth', 'n_estimators', 'min_samples_leaf']].to_dict()
In [24]:
params_recall
Out[24]:
{'max_depth': 8, 'min_samples_leaf': 3, 'n_estimators': 64}
clf = ensemble.RandomForestClassifier(**params_recall, n_jobs=-1) clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None)pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only') vis.show()cm = metrics.confusion_matrix(test.df['sky_status'].values, pred) vis = visualize.Visualizer() vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best accuracy model

In [25]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [26]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [27]:
best_accuracy = runs_df.iloc[runs_df['accuracy'].idxmax()]
In [28]:
print(best_accuracy.equals(best_recall))
False
In [29]:
params_accuracy = best_accuracy[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()
In [30]:
params_accuracy
Out[30]:
{'class_weight': nan,
 'max_depth': 16,
 'min_samples_leaf': 1,
 'n_estimators': 64}
In [31]:
params_accuracy['class_weight'] = None
clf = ensemble.RandomForestClassifier(**params_accuracy, n_jobs=-1) clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None)pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only') vis.show()cm = metrics.confusion_matrix(test.df['sky_status'].values, pred) vis = visualize.Visualizer() vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

Best precision model

In [32]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [33]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [34]:
best_precision = runs_df.iloc[runs_df['precision'].idxmax()]
In [35]:
print(best_precision.equals(best_recall))
print(best_precision.equals(best_accuracy))
False
False
In [36]:
params_precision = best_precision[['max_depth', 'n_estimators', 'class_weight', 'min_samples_leaf']].to_dict()

Same model as accuracy.

Best f1 model

In [37]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [38]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [39]:
best_f1 = runs_df.iloc[runs_df['f1'].idxmax()]
In [40]:
print(best_f1.equals(best_recall))
print(best_f1.equals(best_accuracy))
print(best_f1.equals(best_precision))
False
False
False

Same model as best accuracy and precision - scroll up.

Train on all NSRDB data, test various freq of ground data

In [41]:
best_precision['class_weight'] = None

train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf = ensemble.RandomForestClassifier(**best_precision[['max_depth', 'class_weight', 'min_samples_leaf']].to_dict(), n_estimators=100)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
/Users/benellis/miniconda3/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[41]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [42]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [43]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('EST')
test = cs_detection.ClearskyDetection(ground.df)
In [44]:
test.trim_dates('10-01-2008', '11-01-2008')
In [45]:
test.df = test.df[test.df.index.minute % 30 == 0]
In [46]:
test.df.keys()
Out[46]:
Index(['GHI', 'Clearsky GHI pvlib', 'sky_status pvlib', 'ghi_status', 'scale'], dtype='object')
In [47]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [48]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:336: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:343: RuntimeWarning:

Scaling did not converge.

In [49]:
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [50]:
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [51]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')
## 15 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-17-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')test.df = test.df[test.df.index.minute % 15 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 10 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 10 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 5 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-04-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 5 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-17-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')## 1 min freq ground dataground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz') ground.df.index = ground.df.index.tz_convert('MST') test = cs_detection.ClearskyDetection(ground.df)test.trim_dates('10-01-2015', '10-08-2015')test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') test.scale_by_irrad('Clearsky GHI pvlib')test.df = test.df[test.df.index.minute % 1 == 0]pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)train2 = cs_detection.ClearskyDetection(train.df) train2.trim_dates('10-01-2015', '10-08-2015') train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min')) train2.df['sky_status'] = train2.df['sky_status'].fillna(False)nsrdb_clear = train2.df['sky_status'] ml_clear = test.df['sky_status iter'] vis = visualize.Visualizer() vis.add_line_ser(test.df['GHI'], 'GHI') vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs') vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only') vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only') vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear') vis.show()probas = clf.predict_proba(test.df[feature_cols].values) test.df['probas'] = 0 test.df['probas'] = probas[:, 1] visualize.plot_ts_slider_highligther(test.df, prob='probas')# Save modelimport picklewith open('8_abq_direction_features_model.pkl', 'wb') as f: pickle.dump(clf, f)!ls *abq*# ConclusionIn general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB labeled points. I suspect this could be further improved my doing a larger hyperparameter search, or even doing some feature extraction/reduction/additions.
In [ ]: